package com.lyf.zone;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * WebMagic page processor for Beike (tj.ke.com) second-hand housing listings.
 *
 * <p>For every downloaded page it (1) queues the listing-detail links found in the HTML,
 * (2) queues further list pages based on the pager's {@code page-data} attribute, bounded by a
 * configurable page cap, and (3) stores the listing fields selected by XPath as result fields.
 *
 * @author yjx
 * @date 2022-05-09 17:23
 */
public class TestProcessor implements PageProcessor {
  /** Default highest list-page number to queue; equals the value that used to be hard-coded. */
  private static final int DEFAULT_MAX_PAGES = 2;

  /** List pages are addressed as this prefix followed by the 1-based page number. */
  private static final String LIST_PAGE_URL_PREFIX = "https://tj.ke.com/ershoufang/pg";

  /**
   * Matches absolute listing-detail URLs. The scheme class is {@code [a-zA-Z]} (the former
   * {@code [a-zA-z]} also accepted the punctuation between 'Z' and 'a'), and the dots of the host
   * name are escaped so they only match a literal '.'.
   */
  private static final String DETAIL_URL_REGEX =
      "[a-zA-Z]+://tj\\.ke\\.com/ershoufang/[1-9]\\d*\\.html";

  /** Presence of a list-page link in the HTML is what triggers pagination handling. */
  private static final String LIST_PAGE_LINK_REGEX = "/ershoufang/pg[1-9]\\d*";

  /** Absolute path of the element that wraps the pager. */
  private static final String PAGER_CONTAINER_XPATH =
      "//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[5]/div[2]/div";

  /** Selects the pager's {@code page-data} attribute, which is parsed as a JSON object. */
  private static final String PAGE_DATA_XPATH =
      "//div[@class='page-box house-lst-page-box']/@page-data";

  private static final String USER_AGENT =
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

  /** Crawl settings, configured once so that {@link #getSite()} has no side effects. */
  private final Site site =
      Site.me()
          .setCharset("UTF-8")
          .setRetryTimes(3)
          .setSleepTime(1000)
          .setTimeOut(10000)
          .setUserAgent(USER_AGENT);

  /** Highest list-page number this processor will queue. */
  private final int maxPages;

  /** Creates a processor that queues list pages up to page {@value #DEFAULT_MAX_PAGES}. */
  public TestProcessor() {
    this(DEFAULT_MAX_PAGES);
  }

  /**
   * Creates a processor with an explicit pagination cap.
   *
   * @param maxPages highest list-page number to queue; must be at least 1
   * @throws IllegalArgumentException if {@code maxPages} is less than 1
   */
  public TestProcessor(int maxPages) {
    if (maxPages < 1) {
      throw new IllegalArgumentException("maxPages must be >= 1, got " + maxPages);
    }
    this.maxPages = maxPages;
  }

  /**
   * Queues follow-up requests found on the page and stores the listing fields.
   *
   * @param page the downloaded page handed over by the spider
   */
  @Override
  public void process(Page page) {
    Html html = page.getHtml();
    queueDetailPages(page, html);
    queueListPages(page, html);
    extractListingFields(page, html);
  }

  /** Queues every listing-detail URL that appears in the page's HTML. */
  private void queueDetailPages(Page page, Html html) {
    List<String> detailUrls = html.regex(DETAIL_URL_REGEX).all();
    page.addTargetRequests(detailUrls);
  }

  /**
   * Queues the list pages that follow the current one.
   *
   * <p>Every lookup is null-checked: when the pager cannot be found or carries no usable
   * {@code curPage}, pagination is skipped instead of throwing, so field extraction still runs.
   */
  private void queueListPages(Page page, Html html) {
    if (html.regex(LIST_PAGE_LINK_REGEX).get() == null) {
      return;
    }
    String pagerHtml = html.xpath(PAGER_CONTAINER_XPATH).toString();
    if (pagerHtml == null) {
      return;
    }
    String pageData = new Html(pagerHtml).xpath(PAGE_DATA_XPATH).toString();
    if (pageData == null) {
      return;
    }
    page.putField("下一页", pageData);

    JSONObject pager = JSON.parseObject(pageData);
    if (pager == null) {
      return;
    }
    Integer curPage = pager.getInteger("curPage");
    if (curPage == null) {
      return;
    }

    // Never go past the page count the pager reports, nor past the configured cap.
    Integer reportedTotal = pager.getInteger("totalPage");
    int lastPage = reportedTotal == null ? maxPages : Math.min(reportedTotal, maxPages);

    List<String> nextPageUrls = new ArrayList<String>();
    for (int pageNo = curPage + 1; pageNo <= lastPage; pageNo++) {
      nextPageUrls.add(LIST_PAGE_URL_PREFIX + pageNo);
    }
    if (CollectionUtils.isNotEmpty(nextPageUrls)) {
      page.addTargetRequests(nextPageUrls);
    }
  }

  /**
   * Stores the listing fields selected by absolute XPath as result fields.
   *
   * <p>The field keys are runtime output and are kept exactly as they were.
   */
  private void extractListingFields(Page page, Html html) {
    page.putField(
        "title",
        html.xpath("//*[@id=\"beike\"]/div[1]/div[2]/div[2]/div/div/div[1]/h1/text()"));
    page.putField(
        "total",
        html.xpath("//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[2]/div[2]/div/span[1]/text()"));
    page.putField(
        "unit",
        html.xpath(
            "//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[2]/div[2]/div/span[2]/span/text()"));
    page.putField(
        "所在区",
        html.xpath(
            "//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[2]/div[4]/div[2]/span[2]/a[1]/text()"));
    page.putField(
        "区域",
        html.xpath(
            "//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[2]/div[4]/div[2]/span[2]/a[2]/text()"));
    page.putField(
        "小区",
        html.xpath("//*[@id=\"beike\"]/div[1]/div[4]/div[1]/div[2]/div[4]/div[1]/a[1]/text()"));
  }

  /**
   * Returns the crawl settings used for every request.
   *
   * @return the shared, pre-configured {@link Site}
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Starts a 10-thread spider on the listing index and prints results to the console.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider.create(new TestProcessor())
        .addUrl("https://tj.ke.com/ershoufang")
        .addPipeline(new ConsolePipeline())
        .thread(10)
        .run();
  }
}
